Differential expression using the bulk RNAseq JAX_RNAseq_ExtraEmbryonic data from the December 2023 release.
There are two model systems involved in this dataset, ExM & PrS.
There are also both normoxia and hypoxia samples involved in this dataset. Need to create another column for this information.
There is only one day value.
library(ggplot2)
library(ggrepel)
library(ggpubr)
library(stringr)
library(MASS)
library(RColorBrewer)
library(DESeq2)
library(viridis)
library(ggpointdensity)
library(dplyr)
library(data.table)
library(ComplexHeatmap)
library(UpSetR)
library(readxl)
theme_set(theme_classic())
dat_path = "../../MorPhiC/data/December-2023/JAX_RNAseq_ExtraEmbryonic/"
dat_path = paste0(dat_path, "processed/Tables")meta = fread("data/December_2023/JAX_RNAseq_ExtraEmbryonic_meta_data.tsv",
data.table = FALSE)
dim(meta)## [1] 90 36
meta[1:2,]## clonal.label library_preparation.label label
## 1 MOK20010-WT GT23-10491 PrS-MOK20010-WT
## 2 MOK20010C-A06 GT23-10506 PrS-MOK20010C-A06
## description
## 1 KOLF2.2 derived primitive syncytium
## 2 KOLF2.2 deleted POU2F3 by deletion of critical exon (CE) derived primitive syncytium
## differentiated_product_protocol_id model_system timepoint_value
## 1 JAXPD001 PrS 6
## 2 JAXPD001 PrS 6
## timepoint_unit final_timepoint treatment_condition wt_control_status
## 1 days Yes hypoxia WT
## 2 days Yes Not applicable KO
## ko_strategy ko_gene library_preparation.description
## 1 WT WT NA
## 2 CE POU2F3 NA
## library_preparation.library_preparation_protocol_id
## 1 JAXPL001
## 2 JAXPL001
## library_preparation.average_fragment_size
## 1 431
## 2 428
## library_preparation.input_amount_value library_preparation.input_amount_unit
## 1 300 ng
## 2 300 ng
## library_preparation.concentration_value
## 1 47.7
## 2 47.7
## library_preparation.concentration_unit library_preparation.total_yield_value
## 1 nM 244
## 2 nM 244
## library_preparation.total_yield_unit library_preparation.cdna_pcr_cycles
## 1 ng 10
## 2 ng 10
## library_preparation.pcr_cycles_for_indexing
## 1 Not available
## 2 Not available
## file_id
## 1 KOLF2_POU2F3_1_GT23-10491_CCGTGAAG-ATCCACTG_S38_L001
## 2 POU2F3_CE_B05_GT23-10506_GCAGAATT-TGGCCGGT_S17_L001
## run_id
## 1 20230809_23-robson-005-run2
## 2 20230809_23-robson-005-run2
## clonal.description clonal.parental_name
## 1 KOLF2.2 KOLF2.2J
## 2 KOLF2.2 deleted POU2F3 by deletion of critical exon (CE) KOLF2.2J
## clonal.clone_id clonal.type clonal.zygosity
## 1 WT iPSC Not applicable
## 2 A06 iPSC Not applicable
## clonal.cell_line_generation_protocol clonal.treatment_condition
## 1 Not applicable Not applicable
## 2 Not applicable Not applicable
## clonal.wt_control_status expression_alteration.label
## 1 WT Not applicable
## 2 KO JAX_POU2F3_Critical_exon
## model_organ
## 1 extra-embryonic primitive syncytial cells
## 2 extra-embryonic primitive syncytial cells
names(meta)## [1] "clonal.label"
## [2] "library_preparation.label"
## [3] "label"
## [4] "description"
## [5] "differentiated_product_protocol_id"
## [6] "model_system"
## [7] "timepoint_value"
## [8] "timepoint_unit"
## [9] "final_timepoint"
## [10] "treatment_condition"
## [11] "wt_control_status"
## [12] "ko_strategy"
## [13] "ko_gene"
## [14] "library_preparation.description"
## [15] "library_preparation.library_preparation_protocol_id"
## [16] "library_preparation.average_fragment_size"
## [17] "library_preparation.input_amount_value"
## [18] "library_preparation.input_amount_unit"
## [19] "library_preparation.concentration_value"
## [20] "library_preparation.concentration_unit"
## [21] "library_preparation.total_yield_value"
## [22] "library_preparation.total_yield_unit"
## [23] "library_preparation.cdna_pcr_cycles"
## [24] "library_preparation.pcr_cycles_for_indexing"
## [25] "file_id"
## [26] "run_id"
## [27] "clonal.description"
## [28] "clonal.parental_name"
## [29] "clonal.clone_id"
## [30] "clonal.type"
## [31] "clonal.zygosity"
## [32] "clonal.cell_line_generation_protocol"
## [33] "clonal.treatment_condition"
## [34] "clonal.wt_control_status"
## [35] "expression_alteration.label"
## [36] "model_organ"
table(table(meta$clonal.label))##
## 1 2
## 66 12
table(table(meta$library_preparation.label))##
## 1
## 90
table(table(meta$label))##
## 1
## 90
meta$label[1:6]## [1] "PrS-MOK20010-WT" "PrS-MOK20010C-A06" "PrS-MOK20010C-B05"
## [4] "PrS-MOK20010C-B06" "PrS-MOK20010P-A09" "PrS-MOK20010P-A10"
cell_line_id = str_split(meta$label, pattern = '-')
table(sapply(cell_line_id, length))##
## 3 4
## 66 24
cell_line_id = lapply(cell_line_id, function(x) { length(x) <- 4; x})
cell_line_id = as.data.frame(do.call(rbind, cell_line_id))
cell_line_id[1:6, ]## V1 V2 V3 V4
## 1 PrS MOK20010 WT <NA>
## 2 PrS MOK20010C A06 <NA>
## 3 PrS MOK20010C B05 <NA>
## 4 PrS MOK20010C B06 <NA>
## 5 PrS MOK20010P A09 <NA>
## 6 PrS MOK20010P A10 <NA>
meta$condition = cell_line_id[, 4]
meta$condition[which(is.na(meta$condition))] = "nor"
table(meta$condition, useNA="ifany")##
## hyp nor
## 12 78
table(meta$model_organ, meta$condition, useNA="ifany")##
## hyp nor
## extra-embryonic mesenchymal cells 0 12
## extra-embryonic primitive syncytial cells 12 66
table(meta$run_id, useNA="ifany")##
## 20230809_23-robson-005-run2 20230918_23-robson-008
## 30 24
## 20230918_23-robson-009 20230918_23-robson-010
## 12 12
## 20231004_23-robson-011
## 12
table(meta$run_id, meta$condition, useNA="ifany")##
## hyp nor
## 20230809_23-robson-005-run2 0 30
## 20230918_23-robson-008 12 12
## 20230918_23-robson-009 0 12
## 20230918_23-robson-010 0 12
## 20231004_23-robson-011 0 12
table(meta$run_id, meta$model_system, useNA="ifany")##
## ExM PrS
## 20230809_23-robson-005-run2 0 30
## 20230918_23-robson-008 0 24
## 20230918_23-robson-009 0 12
## 20230918_23-robson-010 0 12
## 20231004_23-robson-011 12 0
table(meta$model_system, meta$ko_gene, useNA="ifany")##
## EPAS1 FOSB GCM1 GRHL1 ISL1 POU2F3 PPARG WT
## ExM 0 0 0 0 9 0 0 3
## PrS 18 9 9 9 0 9 9 15
table(meta$run_id, meta$ko_gene, useNA="ifany")##
## EPAS1 FOSB GCM1 GRHL1 ISL1 POU2F3 PPARG WT
## 20230809_23-robson-005-run2 0 9 0 9 0 9 0 3
## 20230918_23-robson-008 18 0 0 0 0 0 0 6
## 20230918_23-robson-009 0 0 9 0 0 0 0 3
## 20230918_23-robson-010 0 0 0 0 0 0 9 3
## 20231004_23-robson-011 0 0 0 0 9 0 0 3
table(meta$ko_strategy, meta$ko_gene, useNA="ifany")##
## EPAS1 FOSB GCM1 GRHL1 ISL1 POU2F3 PPARG WT
## CE 6 3 3 3 3 3 3 0
## KO 6 3 3 3 3 3 3 0
## PTC 6 3 3 3 3 3 3 0
## WT 0 0 0 0 0 0 0 18
Manually correcting the gene strings in certain column names.
As of the time of running this code, all differences below are due to partial correction of the GHRL gene name.
It should be “GRHL1” instead of “GHRL1”.
It is fixed in the file_id column in meta table, but not fixed in the count data file column names yet.
cts = fread(file.path(dat_path, "genesCounts.csv"), data.table = FALSE)
dim(cts)## [1] 38592 91
cts[1:2, c(1:2, (ncol(cts)-1):ncol(cts))]## Name POU2F3_KO_F04_GT23-10504_GGTACCTT-GACGTCTT_S33_L001
## 1 ENSG00000268674 0
## 2 ENSG00000271254 1030
## PTC_A09__Hypoxia_GT23-11309_AACGTTCC-AGTACTCC_S30_L001
## 1 0
## 2 489
## PTC_D10__Hypoxia_GT23-11310_GCAGAATT-TGGCCGGT_S22_L001
## 1 0
## 2 603
setdiff(names(cts)[-1], meta$file_id)## [1] "GHRL1_CE_A05_GT23-10497_TTGGACTC-CTGCTTCC_S12_L001"
## [2] "KOLF2_GHRL1_2_GT23-10492_TTACAGGA-GCTTGTCA_S6_L001"
## [3] "GHRL1_PTC_A09_GT23-10500_TAATACAG-GTGAATAT_S2_L001"
## [4] "GHRL1_PTC_C09_GT23-10502_ATGTAAGT-CATAGAGT_S19_L001"
## [5] "GHRL1_PTC_A10_GT23-10501_CGGCGTGA-ACAGGCGC_S30_L001"
## [6] "GHRL1_CE_A08_GT23-10498_CAGTAGGC-ATTCGTCA_S39_L001"
## [7] "GHRL1_KO_D02_GT23-10494_TCTCTACT-GAACCGCG_S26_L001"
## [8] "GHRL1_KO_H04_GT23-10496_CCAAGTCT-TCATCCTT_S35_L001"
## [9] "GHRL1_CE_D07_GT23-10499_TGACGAAT-GCCTACTG_S31_L001"
## [10] "GHRL1_KO_G04_GT23-10495_CTCTCGTC-AGGTTATA_S15_L001"
setdiff(meta$file_id, names(cts)[-1])## [1] "KOLF2_GRHL1_2_GT23-10492_TTACAGGA-GCTTGTCA_S6_L001"
## [2] "GRHL1_CE_A05_GT23-10497_TTGGACTC-CTGCTTCC_S12_L001"
## [3] "GRHL1_CE_A08_GT23-10498_CAGTAGGC-ATTCGTCA_S39_L001"
## [4] "GRHL1_CE_D07_GT23-10499_TGACGAAT-GCCTACTG_S31_L001"
## [5] "GRHL1_PTC_A09_GT23-10500_TAATACAG-GTGAATAT_S2_L001"
## [6] "GRHL1_PTC_A10_GT23-10501_CGGCGTGA-ACAGGCGC_S30_L001"
## [7] "GRHL1_PTC_C09_GT23-10502_ATGTAAGT-CATAGAGT_S19_L001"
## [8] "GRHL1_KO_D02_GT23-10494_TCTCTACT-GAACCGCG_S26_L001"
## [9] "GRHL1_KO_G04_GT23-10495_CTCTCGTC-AGGTTATA_S15_L001"
## [10] "GRHL1_KO_H04_GT23-10496_CCAAGTCT-TCATCCTT_S35_L001"
## starting manually correcting count data file column names
names(cts) = gsub("GHRL", "GRHL", names(cts))
## end the manual correction
stopifnot(setequal(names(cts)[-1], meta$file_id))
meta = meta[match(names(cts)[-1], meta$file_id),]
table(names(cts)[-1] == meta$file_id)##
## TRUE
## 90
cts_dat = data.matrix(cts[,-1])
rownames(cts_dat) = cts$Name
table(meta$timepoint_value, useNA="ifany")##
## 6
## 90
meta$timepoint = as.character(meta$timepoint_value)gene_anno = fread("data/gencode_v44_primary_assembly_info.tsv")
dim(gene_anno)## [1] 62754 8
gene_anno[1:2,]## geneId chr strand start end ensembl_gene_id
## <char> <char> <char> <int> <int> <char>
## 1: ENSG00000000003.16 chrX - 100627108 100639991 ENSG00000000003
## 2: ENSG00000000005.6 chrX + 100584936 100599885 ENSG00000000005
## hgnc_symbol description
## <char> <char>
## 1: TSPAN6 tetraspanin 6 [Source:HGNC Symbol;Acc:HGNC:11858]
## 2: TNMD tenomodulin [Source:HGNC Symbol;Acc:HGNC:17757]
table(rownames(cts_dat) %in% gene_anno$ensembl_gene_id)##
## TRUE
## 38592
# all ensembl gene IDs are contained in the annotation
setdiff(rownames(cts_dat), gene_anno$ensembl_gene_id)## character(0)
model_s = meta$model_system
table(model_s, useNA="ifany")## model_s
## ExM PrS
## 12 78
get_q75 <- function(x){quantile(x, probs = 0.75)}
gene_info = data.frame(Name = cts$Name,
t(apply(cts_dat, 1, tapply, model_s, min)),
t(apply(cts_dat, 1, tapply, model_s, median)),
t(apply(cts_dat, 1, tapply, model_s, get_q75)),
t(apply(cts_dat, 1, tapply, model_s, max)))
dim(gene_info)## [1] 38592 9
gene_info[1:2,]## Name ExM PrS ExM.1 PrS.1 ExM.2 PrS.2 ExM.3 PrS.3
## ENSG00000268674 ENSG00000268674 0 0 0.0 0.0 0.00 0 0 3
## ENSG00000271254 ENSG00000271254 634 387 812.5 775.5 848.25 923 948 1169
table(row.names(gene_info)==gene_info$Name, useNA="ifany")##
## TRUE
## 38592
names(gene_info)[2:9] = paste0(rep(c("ExM_", "PrS_"), times=4),
rep(c("min", "median", "q75", "max"), each=2))
dim(gene_info)## [1] 38592 9
gene_info[1:2,]## Name ExM_min PrS_min ExM_median PrS_median ExM_q75
## ENSG00000268674 ENSG00000268674 0 0 0.0 0.0 0.00
## ENSG00000271254 ENSG00000271254 634 387 812.5 775.5 848.25
## PrS_q75 ExM_max PrS_max
## ENSG00000268674 0 0 3
## ENSG00000271254 923 948 1169
summary(gene_info[,-1])## ExM_min PrS_min ExM_median PrS_median
## Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 0.0
## 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.0
## Median : 0.0 Median : 0.0 Median : 2.0 Median : 1.0
## Mean : 376.3 Mean : 255.1 Mean : 530.5 Mean : 585.1
## 3rd Qu.: 120.0 3rd Qu.: 60.0 3rd Qu.: 188.5 3rd Qu.: 182.5
## Max. :512761.0 Max. :154723.0 Max. :797550.5 Max. :379480.0
## ExM_q75 PrS_q75 ExM_max PrS_max
## Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 0
## 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 1
## Median : 3.0 Median : 3.0 Median : 6.0 Median : 10
## Mean : 608.8 Mean : 712.7 Mean : 751.6 Mean : 1102
## 3rd Qu.: 228.0 3rd Qu.: 233.8 3rd Qu.: 285.0 3rd Qu.: 405
## Max. :886970.0 Max. :456738.2 Max. :928104.0 Max. :801404
table(gene_info$ExM_q75 > 0, gene_info$PrS_q75 > 0)##
## FALSE TRUE
## FALSE 12757 959
## TRUE 2139 22737
w2kp = which(gene_info$ExM_q75 > 0 | gene_info$PrS_q75 > 0)
cts_dat = cts_dat[w2kp,]
dim(cts_dat)## [1] 25835 90
gene_info = gene_info[w2kp,]
meta$read_depth = colSums(cts_dat)/1e6
meta$q75 = apply(cts_dat, 2, quantile, probs = 0.75)
ggplot(meta, aes(x=read_depth, y=q75, color=ko_strategy, shape = model_system)) +
geom_point(size=2, alpha=0.7) + ggtitle("Gene counts") +
scale_shape_manual(values = c(7, 16)) +
xlab("read depth (million)") + ylab("75 percentile of gene expression") +
labs(color = "KO type", shape = "Model") +
scale_color_brewer(palette = "Set1")meta$run_id_short = str_replace(meta$run_id, "^[^-]*-", "")
ggplot(meta, aes(x=read_depth, y=q75, color=run_id_short, shape = condition)) +
geom_point(size=2, alpha=0.7) + ggtitle("Gene counts") +
scale_shape_manual(values = c(7, 16)) +
xlab("read depth (million)") + ylab("75 percentile of gene expression") +
labs(color = "Run ID", shape = "Condition") +
scale_color_brewer(palette = "Set1")table(meta$model_system, meta$run_id_short)##
## robson-005-run2 robson-008 robson-009 robson-010 robson-011
## ExM 0 0 0 0 12
## PrS 30 24 12 12 0
ggplot(meta, aes(x=read_depth, y=q75, color=run_id_short, shape = model_system)) +
geom_point(size=2, alpha=0.7) + ggtitle("Gene counts") +
scale_shape_manual(values = c(7, 16)) +
xlab("read depth (million)") + ylab("75 percentile of gene expression") +
labs(color = "Run ID", shape = "Model") +
scale_color_brewer(palette = "Set1")sample2kp = which(meta$ko_gene == "WT")
cts_dat_m = cts_dat[,sample2kp]
meta_m = meta[sample2kp,]
summary(meta_m$q75)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 361.0 422.2 469.8 478.4 540.5 592.0
stopifnot(all(meta_m$file_id == colnames(cts_dat_m)))
q75 = apply(cts_dat_m, 1, quantile, probs=0.75)
cts_dat_n = t(cts_dat_m[q75 > 0,])
cts_dat_n = log(median(meta_m$q75)*cts_dat_n/meta_m$q75 + 1)
dim(cts_dat_n)## [1] 18 23865
sample_pca = prcomp(cts_dat_n, center = TRUE, scale. = TRUE)
names(sample_pca)## [1] "sdev" "rotation" "center" "scale" "x"
pc_scores = sample_pca$x
eigen_vals = sample_pca$sdev^2
eigen_vals[1:5]/sum(eigen_vals)## [1] 0.41336077 0.18367275 0.08208396 0.04567809 0.03608958
pvs = round(eigen_vals[1:5]/sum(eigen_vals)*100,1)
pvs## [1] 41.3 18.4 8.2 4.6 3.6
PC_df = data.frame(pc_scores)
PC_df = cbind(PC_df, meta_m)
gPC = ggplot(PC_df, aes(x = PC1, y = PC2, shape=condition, color=model_system)) +
geom_point(size=2.5, alpha=0.7) + scale_color_brewer(palette = "Dark2") +
scale_shape_manual(values = c(7, 15, 16, 17)) +
labs(color="Model", shape ="Condition") +
xlab(sprintf("PC1 (%.1f%% variance)", pvs[1])) +
ylab(sprintf("PC2 (%.1f%% variance)", pvs[2])) +
ggtitle("Wild type samples") +
guides(
color = guide_legend(title = NULL, order = 1),
shape = guide_legend(title = NULL, order = 2)
) +
theme(
legend.margin = margin(0, 0, 0, 0),
legend.box.just = "left",
legend.direction = "vertical"
)
print(gPC)gPC = ggplot(PC_df, aes(x = PC1, y = PC2, shape=condition, color=run_id_short)) +
geom_point(size=2.5, alpha=0.7) + scale_color_brewer(palette = "Dark2") +
scale_shape_manual(values = c(7, 15, 16, 17)) +
labs(color="Run id", shape ="Condition") +
xlab(sprintf("PC1 (%.1f%% variance)", pvs[1])) +
ylab(sprintf("PC2 (%.1f%% variance)", pvs[2])) +
ggtitle("Wild type samples") +
guides(
color = guide_legend(title = NULL, order = 1),
shape = guide_legend(title = NULL, order = 2)
) +
theme(
legend.margin = margin(0, 0, 0, 0),
legend.box.just = "left",
legend.direction = "vertical"
)
print(gPC)table(meta_m$run_id, meta_m$model_system)##
## ExM PrS
## 20230809_23-robson-005-run2 0 3
## 20230918_23-robson-008 0 6
## 20230918_23-robson-009 0 3
## 20230918_23-robson-010 0 3
## 20231004_23-robson-011 3 0
## Generate sample information matrix
cols2kp = c("model_system", "condition", "q75")
sample_info = meta_m[,cols2kp]
dim(sample_info)## [1] 18 3
sample_info[1:2,]## model_system condition q75
## 79 PrS nor 569.5
## 12 PrS hyp 545.0
sample_info$log_q75 = log(sample_info$q75)
dds = DESeqDataSetFromMatrix(cts_dat_m, colData=sample_info,
~ model_system + condition + log_q75)
dds = DESeq(dds)
## association with read-depth
res_rd = results(dds, name = "log_q75")
res_rd = as.data.frame(res_rd)
dim(res_rd)## [1] 25835 6
res_rd[1:2,]## baseMean log2FoldChange lfcSE stat pvalue
## ENSG00000271254 731.89279 0.0114876 0.2318513 0.04954726 0.9604832
## ENSG00000277196 46.50349 0.9817827 1.3388605 0.73329722 0.4633772
## padj
## ENSG00000271254 0.9997047
## ENSG00000277196 0.9997047
table(is.na(res_rd$padj))##
## FALSE TRUE
## 17751 8084
g0 = ggplot(res_rd %>% subset(!is.na(padj)), aes(x=pvalue)) +
geom_histogram(color="darkblue", fill="lightblue",
breaks = seq(0,1,by=0.01)) +
ggtitle("WT Read depth")
print(g0)## association with model_system
dds_lrt = DESeq(dds, test="LRT", reduced = ~ condition + log_q75)
res_lrt = results(dds_lrt)
res_model_system = as.data.frame(res_lrt)
dim(res_model_system)## [1] 25835 6
res_model_system[1:2,]## baseMean log2FoldChange lfcSE stat pvalue
## ENSG00000271254 731.89279 0.0114876 0.2318513 4.06378556 0.04381218
## ENSG00000277196 46.50349 0.9817827 1.3388605 0.04552192 0.83104721
## padj
## ENSG00000271254 0.06908888
## ENSG00000277196 0.87097036
table(is.na(res_model_system$padj))##
## FALSE TRUE
## 23736 2099
g0 = ggplot(res_model_system %>% subset(!is.na(padj)), aes(x=pvalue)) +
geom_histogram(color="darkblue", fill="lightblue",
breaks = seq(0,1,by=0.01)) +
ggtitle("WT Model system")
print(g0)For the December 2023 release of JAX_RNAseq_ExtraEmbryonic, there are two model systems: ExM and PrS, and two conditions, nor and hyp.
There are three gene knock out strategies.
Explore the PC plots first, to decide which level to run DESeq2 on.
Then run the DE analysis.
Since there is only one day value for each model system, do not include day value in each scatter plot.
meta$model_condition = paste(meta$model_system, meta$condition, sep="_")
table(meta$model_condition)##
## ExM_nor PrS_hyp PrS_nor
## 12 12 66
table(meta$run_id, meta$model_condition)##
## ExM_nor PrS_hyp PrS_nor
## 20230809_23-robson-005-run2 0 0 30
## 20230918_23-robson-008 0 12 12
## 20230918_23-robson-009 0 0 12
## 20230918_23-robson-010 0 0 12
## 20231004_23-robson-011 12 0 0
unique_model_condition_ss = unique(meta$model_condition)
unique_model_condition_ss = sort(unique_model_condition_ss)
for(cur_mc in unique_model_condition_ss){
print(cur_mc)
sample2kp = which(meta$model_condition == cur_mc)
cts_dat_m = cts_dat[,sample2kp]
meta_m = meta[sample2kp,]
print(table(meta_m$ko_strategy, str_extract(colnames(cts_dat_m), "^[^_]+_[^_]+")))
extracted_ko_strings = str_extract_all(colnames(cts_dat_m), "CE|KO|PTC|WT")
print(table(sapply(extracted_ko_strings, length)))
print(table(meta_m$ko_strategy, unlist(extracted_ko_strings)))
if (cur_mc == "PrS_nor"){
print("The overlap between WT and KO is not real overlap. It is due to a different part of the filename contains substring 'KO' in these three samples:")
print(meta_m$file_id[(meta_m$ko_strategy=="WT") & (unlist(extracted_ko_strings)=="KO")])
}
stopifnot(all(meta_m$file_id == colnames(cts_dat_m)))
q75 = apply(cts_dat_m, 1, quantile, probs=0.75)
cts_dat_n = t(cts_dat_m[q75 > 0,])
cts_dat_n = log(median(meta_m$q75)*cts_dat_n/meta_m$q75 + 1)
dim(cts_dat_n)
sample_pca = prcomp(cts_dat_n, center = TRUE, scale. = TRUE)
names(sample_pca)
pc_scores = sample_pca$x
eigen_vals = sample_pca$sdev^2
eigen_vals[1:5]/sum(eigen_vals)
pvs = round(eigen_vals[1:5]/sum(eigen_vals)*100,1)
pvs
PC_df = data.frame(pc_scores)
PC_df = cbind(PC_df, meta_m)
gPC = ggplot(PC_df, aes(x = PC1, y = PC2, shape=ko_strategy, color=ko_gene)) +
geom_point(size=2.5, alpha=0.7) + scale_color_brewer(palette = "Dark2") +
scale_shape_manual(values = c(7, 15, 16, 17)) +
xlab(sprintf("PC1 (%.1f%% variance)", pvs[1])) +
ylab(sprintf("PC2 (%.1f%% variance)", pvs[2])) +
ggtitle(cur_mc) +
guides(
color = guide_legend(title = NULL, order = 1),
shape = guide_legend(title = NULL, order = 2)
) +
theme(
legend.margin = margin(0, 0, 0, 0),
legend.box.just = "left",
legend.direction = "vertical"
)
print(gPC)
gPC = ggplot(PC_df, aes(x = PC1, y = PC2, shape=run_id_short, color=ko_gene)) +
geom_point(size=2.5, alpha=0.7) + scale_color_brewer(palette = "Dark2") +
scale_shape_manual(values = c(7, 15, 16, 17)) +
xlab(sprintf("PC1 (%.1f%% variance)", pvs[1])) +
ylab(sprintf("PC2 (%.1f%% variance)", pvs[2])) +
ggtitle(cur_mc) +
guides(
color = guide_legend(title = NULL, order = 1),
shape = guide_legend(title = NULL, order = 2)
) +
theme(
legend.margin = margin(0, 0, 0, 0),
legend.box.just = "left",
legend.direction = "vertical"
)
print(gPC)
gPC = ggplot(PC_df, aes(x = PC1, y = PC2, shape=run_id_short, color=ko_gene,
alpha = ifelse(ko_gene == "WT", 1, 0.8))) +
geom_point(size=2.5) +
scale_color_brewer(palette = "Dark2") +
scale_shape_manual(values = c(7, 15, 16, 17)) +
xlab(sprintf("PC1 (%.1f%% variance)", pvs[1])) +
ylab(sprintf("PC2 (%.1f%% variance)", pvs[2])) +
ggtitle(cur_mc) + guides(alpha = "none") +
guides(
color = guide_legend(title = NULL, order = 1),
shape = guide_legend(title = NULL, order = 2)
) +
theme(
legend.margin = margin(0, 0, 0, 0),
legend.box.just = "left",
legend.direction = "vertical"
)
print(gPC)
table(meta_m$run_id, meta_m$ko_gene)
}## [1] "ExM_nor"
##
## ISL1_CE ISL1_KO ISL1_PTC ISL1_WT
## CE 3 0 0 0
## KO 0 3 0 0
## PTC 0 0 3 0
## WT 0 0 0 3
##
## 1
## 12
##
## CE KO PTC WT
## CE 3 0 0 0
## KO 0 3 0 0
## PTC 0 0 3 0
## WT 0 0 0 3
## [1] "PrS_hyp"
##
## CE_E06 CE_G07 CE_H06 KO_A03 KO_B01 KO_B02 PTC_A09 PTC_D10 PTC_G09 WT_1
## CE 1 1 1 0 0 0 0 0 0 0
## KO 0 0 0 1 1 1 0 0 0 0
## PTC 0 0 0 0 0 0 1 1 1 0
## WT 0 0 0 0 0 0 0 0 0 1
##
## WT_2 WT_3
## CE 0 0
## KO 0 0
## PTC 0 0
## WT 1 1
##
## 1
## 12
##
## CE KO PTC WT
## CE 3 0 0 0
## KO 0 3 0 0
## PTC 0 0 3 0
## WT 0 0 0 3
## [1] "PrS_nor"
##
## CE_E06 CE_G07 CE_H06 FOSB_CE FOSB_KO FOSB_PTC GCM1_CE GCM1_KO GCM1_PTC
## CE 1 1 1 3 0 0 3 0 0
## KO 0 0 0 0 3 0 0 3 0
## PTC 0 0 0 0 0 3 0 0 3
## WT 0 0 0 0 0 0 0 0 0
##
## GCM1_WT GRHL1_CE GRHL1_KO GRHL1_PTC KO_A03 KO_B01 KO_B02 KOLF2_FOSB
## CE 0 3 0 0 0 0 0 0
## KO 0 0 3 0 1 1 1 0
## PTC 0 0 0 3 0 0 0 0
## WT 3 0 0 0 0 0 0 1
##
## KOLF2_GRHL1 KOLF2_POU2F3 POU2F3_CE POU2F3_KO POU2F3_PTC PPARG_CE PPARG_KO
## CE 0 0 3 0 0 3 0
## KO 0 0 0 3 0 0 3
## PTC 0 0 0 0 3 0 0
## WT 1 1 0 0 0 0 0
##
## PPARG_PTC PPARG_WT PTC_A09 PTC_D10 PTC_G09 WT_1 WT_2 WT_3
## CE 0 0 0 0 0 0 0 0
## KO 0 0 0 0 0 0 0 0
## PTC 3 0 1 1 1 0 0 0
## WT 0 3 0 0 0 1 1 1
##
## 1
## 66
##
## CE KO PTC WT
## CE 18 0 0 0
## KO 0 18 0 0
## PTC 0 0 18 0
## WT 0 3 0 9
## [1] "The overlap between WT and KO is not real overlap. It is due to a different part of the filename contains substring 'KO' in these three samples:"
## [1] "KOLF2_GRHL1_2_GT23-10492_TTACAGGA-GCTTGTCA_S6_L001"
## [2] "KOLF2_FOSB_1_GT23-10493_GACCTGAA-CTCACCAA_S32_L001"
## [3] "KOLF2_POU2F3_1_GT23-10491_CCGTGAAG-ATCCACTG_S38_L001"
Use each combination of (model system, condition) as DE group.
Run analysis for each DE group separately.
For the output files, create two versions, one with direct information, one with padj set to NA for the genes when the number of 0 per test is >=4.
In more details, for both versions of the outputs, make it one file per knockout gene per DE group (DE group can be model system, model system + condition, model system + run_id, model system + day, etc.). The first version of the output files contains
gene id, gene symbol
and for each knockout strategy:
baseMean, log2FoldChange, lfcSE, stat, pvalue, padj
The second version of the output files contains
gene id, gene symbol, chr, strand, position
and for each knockout strategy:
log2FoldChange, pvalue, padj (set to be NA if the number of 0 per test >= 4),
In addition, save out a separate file of the sample size for each comparison. This needs to be by DE group.
df_size = NULL
release_name = "2023_12_JAX_RNAseq_ExtraEmbryonic"
output_dir = file.path("results", release_name)
raw_output_dir = file.path(output_dir, "raw")
processed_output_dir = file.path(output_dir, "processed")
if (!file.exists(raw_output_dir)){
dir.create(raw_output_dir, recursive = TRUE)
}
if (!file.exists(processed_output_dir)){
dir.create(processed_output_dir, recursive = TRUE)
}unique_model_ss = unique(meta$model_system)
unique_model_ss = sort(unique_model_ss)
unique_model_ss## [1] "ExM" "PrS"
for (model1 in unique_model_ss){
print(model1)
sample2kp = which(meta$model_system == model1)
cts_dat_m = cts_dat[,sample2kp]
meta_m = meta[sample2kp,]
stopifnot(all(meta_m$file_id == colnames(cts_dat_m)))
print(table(meta_m$file_id == colnames(cts_dat_m), useNA="ifany"))
## Generate sample information matrix
cols2kp = c("model_system", "condition", "ko_strategy", "ko_gene", "run_id",
"run_id_short", "q75", "timepoint")
sample_info = meta_m[,cols2kp]
colnames(sample_info)[which(colnames(sample_info)=="timepoint")] = "day"
dim(sample_info)
sample_info[1:2,]
print(table(sample_info$ko_strategy, sample_info$ko_gene))
sample_info$ko_group = paste(sample_info$ko_gene,
sample_info$ko_strategy, sep="_")
print(table(sample_info$ko_group))
# use the combination of (model system, condition) as DE group
sample_info$model_day = paste0(sample_info$model_system,
"_day_",
as.character(sample_info$day))
sample_info$de_grp = paste0(sample_info$model_day, "_",
sample_info$condition)
sorted_de_grps = sort(unique(sample_info$de_grp))
sorted_de_grps
for(d_group in sorted_de_grps){
print(d_group)
dg_index = which(sample_info$de_grp==d_group)
cts_dat_m_dg = cts_dat_m[, dg_index]
sample_info_dg = sample_info[dg_index, ]
stopifnot(all(sample_info_dg$de_grp==d_group))
print("-----------------------------------")
print(paste0("Model system: ", model1))
print(sprintf("DE analysis group %s DESeq2 results:", d_group))
print("-----------------------------------")
# do two steps of filtering
# first, filter based on q75 of each gene
# second, filter based on whether each gene is expressed in at least 4 cells
dg_q75 = apply(cts_dat_m_dg, 1, quantile, probs=0.75)
cts_dat_m_dg = cts_dat_m_dg[dg_q75 > 0,]
print(sprintf("After filtering by gene expression q75, the number of genes reduces from %d to %d",
length(dg_q75), nrow(cts_dat_m_dg)))
n_pos = rowSums(cts_dat_m_dg>0)
cts_dat_m_dg = cts_dat_m_dg[n_pos >= 4,]
if (length(n_pos)==nrow(cts_dat_m_dg)){
print("After filtering by nonzero expression in at least 4 samples, the number of genes does not change.")
}else{
print(sprintf("After filtering by nonzero expression in at least 4 samples, the number of genes reduces from %d to %d",
length(n_pos), nrow(cts_dat_m_dg)))
}
# update the q75 based on only genes used in the process
sample_info_dg$q75 = apply(cts_dat_m_dg, 2, quantile, probs = 0.75)
sample_info_dg$log_q75 = log(sample_info_dg$q75)
if (length(unique(sample_info_dg$run_id))==1){
dds = DESeqDataSetFromMatrix(cts_dat_m_dg, colData=sample_info_dg,
~ ko_group + log_q75)
}else{
dds = DESeqDataSetFromMatrix(cts_dat_m_dg, colData=sample_info_dg,
~ ko_group + run_id + log_q75)
}
start.time <- Sys.time()
dds = DESeq(dds)
end.time <- Sys.time()
time.taken <- end.time - start.time
print(time.taken)
## association with read-depth
res_rd = results(dds, name = "log_q75")
res_rd = as.data.frame(res_rd)
dim(res_rd)
res_rd[1:2,]
table(is.na(res_rd$padj))
g0 = ggplot(res_rd %>% subset(!is.na(padj)), aes(x=pvalue)) +
geom_histogram(color="darkblue", fill="lightblue",
breaks = seq(0,1,by=0.01)) +
ggtitle(paste0(d_group, " read depth"))
print(g0)
## Rerun the analysis without read-depth if it is not significant for
## most genes, and the number of samples is small
## i.e., proportion of non-null in the distribution is smaller than 0.01
## (this needs to be restricted to the genes with not NA adjusted pvalue)
## or if smaller than 0.1 and the number of samples is not greater than 6
pi_1_rd = max(0, mean(res_rd$pvalue[!is.na(res_rd$padj)] < 0.05) - 0.05)
pi_1_rd = max(pi_1_rd, 0, 1 - 2*mean(res_rd$pvalue[!is.na(res_rd$padj)] > 0.5))
print(sprintf("prop of non-null for rd: %.2e", pi_1_rd))
if(pi_1_rd < 0.01 || (ncol(cts_dat_m_dg) <= 6 && pi_1_rd < 0.1 )){
print("rerun DESeq2 without read depth")
include_rd = 0
if (length(unique(sample_info_dg$run_id))==1){
dds = DESeqDataSetFromMatrix(cts_dat_m_dg, colData=sample_info_dg,
~ ko_group)
}else{
dds = DESeqDataSetFromMatrix(cts_dat_m_dg, colData=sample_info_dg,
~ ko_group + run_id)
}
start.time <- Sys.time()
dds = DESeq(dds)
end.time <- Sys.time()
time.taken <- end.time - start.time
print(time.taken)
}else{
include_rd = 1
}
## association with run_id
if (length(unique(sample_info_dg$run_id))>1){
if(include_rd){
dds_lrt = DESeq(dds, test="LRT", reduced = ~ ko_group + log_q75)
}else{
dds_lrt = DESeq(dds, test="LRT", reduced = ~ ko_group)
}
res_lrt = results(dds_lrt)
res_run_id = as.data.frame(res_lrt)
dim(res_run_id)
res_run_id[1:2,]
table(is.na(res_run_id$padj))
g0 = ggplot(res_run_id %>% subset(!is.na(padj)), aes(x=pvalue)) +
geom_histogram(color="darkblue", fill="lightblue",
breaks = seq(0,1,by=0.01)) +
ggtitle(paste0(d_group, " Run ID"))
print(g0)
}
## DE analysis for each KO gene
table(sample_info_dg$ko_gene)
table(sample_info_dg$ko_gene, sample_info_dg$ko_strategy)
genos = setdiff(unique(sample_info_dg$ko_gene), "WT")
genos = sort(genos)
genos
ko_grp = c("CE", "KO", "PTC")
for(geno in genos){
res_geno_df = NULL
res_geno_reduced_df = NULL
res_geno = list()
for(k_grp1 in ko_grp){
res_g1 = results(dds, contrast = c("ko_group",
paste0(geno, "_", k_grp1), "WT_WT"))
res_g1 = signif(data.frame(res_g1), 3)
# add a column to record the number of zeros per test
test_index = which(sample_info_dg$ko_group%in%c(paste0(geno, "_", k_grp1), "WT_WT"))
cts_dat_m_dg_test = cts_dat_m_dg[, test_index]
n_zero = rowSums(cts_dat_m_dg_test==0)
res_g1$n_zeros = n_zero
# record the number of samples involved in the comparison
test_ko_group_vec = sample_info_dg$ko_group[test_index]
df_size = rbind(df_size,
c(d_group,
paste0(geno, "_", k_grp1),
sum(test_ko_group_vec!="WT_WT"),
sum(test_ko_group_vec=="WT_WT")))
# prepare a processed version of output
res_g1_reduced = res_g1[, c("log2FoldChange", "pvalue", "padj", "n_zeros")]
res_g1_reduced$padj[which(res_g1_reduced$n_zeros>=4)] = NA
if ((sum(test_ko_group_vec!="WT_WT")==1)|(sum(test_ko_group_vec=="WT_WT")==1)){
res_g1_reduced$padj = NA
}
res_g1_reduced = res_g1_reduced[, c("log2FoldChange", "pvalue", "padj")]
res_geno[[k_grp1]] = res_g1_reduced
if ((sum(test_ko_group_vec!="WT_WT")>1) & (sum(test_ko_group_vec=="WT_WT")>1)){
g1 = ggplot(res_g1_reduced %>% subset(!is.na(padj)), aes(x=pvalue)) +
geom_histogram(color="darkblue", fill="lightblue",
breaks = seq(0,1,by=0.01)) +
ggtitle(paste0(d_group, " ", geno, "_", k_grp1))
print(g1)
}
tag1 = sprintf("%s_%s_", geno, k_grp1)
colnames(res_g1) = paste0(tag1, colnames(res_g1))
colnames(res_g1_reduced) = paste0(tag1, colnames(res_g1_reduced))
if(is.null(res_geno_df)){
res_geno_df = res_g1
}else if(!is.null(res_geno_df)){
stopifnot(all(rownames(res_geno_df) == rownames(res_g1)))
res_geno_df = cbind(res_geno_df, res_g1)
}
if(is.null(res_geno_reduced_df)){
res_geno_reduced_df = res_g1_reduced
}else if(!is.null(res_geno_reduced_df)){
stopifnot(all(rownames(res_geno_reduced_df) == rownames(res_g1_reduced)))
res_geno_reduced_df = cbind(res_geno_reduced_df, res_g1_reduced)
}
}
get_index <- function(x){
which(x$padj < 0.05 & abs(x$log2FoldChange) > log2(1.5))
}
w_ce = get_index(res_geno[["CE"]])
w_ko = get_index(res_geno[["KO"]])
w_ptc = get_index(res_geno[["PTC"]])
print(paste0("DE group ", d_group, " under KO gene ", geno, ":"))
print(paste0("number of DE genes from knock out strategy CE: ",
as.character(length(w_ce))))
print(paste0("number of DE genes from knock out strategy KO: ",
as.character(length(w_ko))))
print(paste0("number of DE genes from knock out strategy PTC: ",
as.character(length(w_ptc))))
ce_ko_overlap = length(intersect(rownames(res_geno[["CE"]])[w_ce],
rownames(res_geno[["KO"]])[w_ko]))
ko_ptc_overlap = length(intersect(rownames(res_geno[["KO"]])[w_ko],
rownames(res_geno[["PTC"]])[w_ptc]))
ptc_ce_overlap = length(intersect(rownames(res_geno[["PTC"]])[w_ptc],
rownames(res_geno[["CE"]])[w_ce]))
print(paste0("number of common DE genes by knock out strategies CE and KO: ",
as.character(ce_ko_overlap)))
print(paste0("number of common DE genes by knock out strategies KO and PTC: ",
as.character(ko_ptc_overlap)))
print(paste0("number of common DE genes by knock out strategies PTC and CE: ",
as.character(ptc_ce_overlap)))
if (max(length(w_ce), length(w_ko), length(w_ptc)) > 0){
m = make_comb_mat(list("CE" = rownames(res_geno[["CE"]])[w_ce],
"KO" = rownames(res_geno[["KO"]])[w_ko],
"PTC" = rownames(res_geno[["PTC"]])[w_ptc]))
g_up = UpSet(m)
print(g_up)
}
df1 = data.frame(padj_CE = res_geno[["CE"]]$padj,
padj_KO = res_geno[["KO"]]$padj,
padj_PTC = res_geno[["PTC"]]$padj)
cr1 = cor(df1$padj_PTC, df1$padj_CE, method = "spearman", use="pair")
cr2 = cor(df1$padj_PTC, df1$padj_KO, method = "spearman", use="pair")
g4 = ggplot(data = df1, mapping = aes(x = -log10(padj_PTC),
y = -log10(padj_CE))) +
geom_pointdensity() +
ggtitle(sprintf("%s, %s\nSpearman r = %.2f", d_group, geno, cr1)) +
scale_color_viridis()
g5 = ggplot(data = df1, mapping = aes(x = -log10(padj_PTC),
y = -log10(padj_KO))) +
geom_pointdensity() +
ggtitle(sprintf("%s, %s\nSpearman r = %.2f", d_group, geno, cr2)) +
scale_color_viridis()
print(g4)
print(g5)
dim(res_geno_df)
res_geno_df[1:2,]
dim(res_geno_reduced_df)
res_geno_reduced_df[1:2,]
res_df = data.frame(gene_ID=rownames(res_geno_df),
res_geno_df)
dim(res_df)
res_df[1:2,]
res_reduced_gene_anno = gene_anno[match(rownames(res_geno_reduced_df),
gene_anno$ensembl_gene_id), ]
stopifnot(all(rownames(res_geno_reduced_df)==res_reduced_gene_anno$ensembl_gene_id))
# set gene hgnc_symbol that equal "" to NA
hgnc_symbol_vec = res_reduced_gene_anno$hgnc_symbol
hgnc_symbol_vec[which(hgnc_symbol_vec=="")] = NA
res_reduced_df = data.frame(gene_ID=rownames(res_geno_reduced_df),
hgnc_symbol=hgnc_symbol_vec,
chr=res_reduced_gene_anno$chr,
strand=res_reduced_gene_anno$strand,
start=res_reduced_gene_anno$start,
end=res_reduced_gene_anno$end,
res_geno_reduced_df)
print("hgnc_symbol empty string and NA tables:")
print(table(res_reduced_df$hgnc_symbol=="", useNA="ifany"))
print(table(is.na(res_reduced_df$hgnc_symbol)))
dim(res_reduced_df)
res_reduced_df[1:2,]
setting_name = unique(paste0(sample_info_dg$model_day, "_",
sample_info_dg$condition))
fwrite(res_df,
sprintf("%s/%s_%s_%s_DEseq2_raw.tsv",
raw_output_dir, release_name, setting_name, geno),
sep="\t")
fwrite(res_reduced_df,
sprintf("%s/%s_%s_%s_DEseq2.tsv",
processed_output_dir, release_name, setting_name, geno),
sep="\t")
}
}
}## [1] "ExM"
##
## TRUE
## 12
##
## ISL1 WT
## CE 3 0
## KO 3 0
## PTC 3 0
## WT 0 3
##
## ISL1_CE ISL1_KO ISL1_PTC WT_WT
## 3 3 3 3
## [1] "ExM_day_6_nor"
## [1] "-----------------------------------"
## [1] "Model system: ExM"
## [1] "DE analysis group ExM_day_6_nor DESeq2 results:"
## [1] "-----------------------------------"
## [1] "After filtering by gene expression q75, the number of genes reduces from 25835 to 24876"
## [1] "After filtering by nonzero expression in at least 4 samples, the number of genes reduces from 24876 to 23842"
## Time difference of 1.552859 mins
## [1] "prop of non-null for rd: 0.00e+00"
## [1] "rerun DESeq2 without read depth"
## Time difference of 8.436475 secs
## [1] "DE group ExM_day_6_nor under KO gene ISL1:"
## [1] "number of DE genes from knock out strategy CE: 652"
## [1] "number of DE genes from knock out strategy KO: 405"
## [1] "number of DE genes from knock out strategy PTC: 432"
## [1] "number of common DE genes by knock out strategies CE and KO: 343"
## [1] "number of common DE genes by knock out strategies KO and PTC: 324"
## [1] "number of common DE genes by knock out strategies PTC and CE: 380"
## [1] "hgnc_symbol empty string and NA tables:"
##
## FALSE <NA>
## 18935 4907
##
## FALSE TRUE
## 18935 4907
## [1] "PrS"
##
## TRUE
## 78
##
## EPAS1 FOSB GCM1 GRHL1 POU2F3 PPARG WT
## CE 6 3 3 3 3 3 0
## KO 6 3 3 3 3 3 0
## PTC 6 3 3 3 3 3 0
## WT 0 0 0 0 0 0 15
##
## EPAS1_CE EPAS1_KO EPAS1_PTC FOSB_CE FOSB_KO FOSB_PTC GCM1_CE
## 6 6 6 3 3 3 3
## GCM1_KO GCM1_PTC GRHL1_CE GRHL1_KO GRHL1_PTC POU2F3_CE POU2F3_KO
## 3 3 3 3 3 3 3
## POU2F3_PTC PPARG_CE PPARG_KO PPARG_PTC WT_WT
## 3 3 3 3 15
## [1] "PrS_day_6_hyp"
## [1] "-----------------------------------"
## [1] "Model system: PrS"
## [1] "DE analysis group PrS_day_6_hyp DESeq2 results:"
## [1] "-----------------------------------"
## [1] "After filtering by gene expression q75, the number of genes reduces from 25835 to 23911"
## [1] "After filtering by nonzero expression in at least 4 samples, the number of genes reduces from 23911 to 23326"
## Time difference of 9.442636 secs
## [1] "prop of non-null for rd: 8.49e-02"
## [1] "DE group PrS_day_6_hyp under KO gene EPAS1:"
## [1] "number of DE genes from knock out strategy CE: 4196"
## [1] "number of DE genes from knock out strategy KO: 2140"
## [1] "number of DE genes from knock out strategy PTC: 2553"
## [1] "number of common DE genes by knock out strategies CE and KO: 1883"
## [1] "number of common DE genes by knock out strategies KO and PTC: 1628"
## [1] "number of common DE genes by knock out strategies PTC and CE: 1960"
## [1] "hgnc_symbol empty string and NA tables:"
##
## FALSE <NA>
## 18654 4672
##
## FALSE TRUE
## 18654 4672
## [1] "PrS_day_6_nor"
## [1] "-----------------------------------"
## [1] "Model system: PrS"
## [1] "DE analysis group PrS_day_6_nor DESeq2 results:"
## [1] "-----------------------------------"
## [1] "After filtering by gene expression q75, the number of genes reduces from 25835 to 23493"
## [1] "After filtering by nonzero expression in at least 4 samples, the number of genes does not change."
## Time difference of 1.405563 mins
## [1] "prop of non-null for rd: 1.30e-01"
## [1] "DE group PrS_day_6_nor under KO gene EPAS1:"
## [1] "number of DE genes from knock out strategy CE: 1372"
## [1] "number of DE genes from knock out strategy KO: 40"
## [1] "number of DE genes from knock out strategy PTC: 304"
## [1] "number of common DE genes by knock out strategies CE and KO: 37"
## [1] "number of common DE genes by knock out strategies KO and PTC: 29"
## [1] "number of common DE genes by knock out strategies PTC and CE: 236"
## [1] "hgnc_symbol empty string and NA tables:"
##
## FALSE <NA>
## 18811 4682
##
## FALSE TRUE
## 18811 4682
## [1] "DE group PrS_day_6_nor under KO gene FOSB:"
## [1] "number of DE genes from knock out strategy CE: 409"
## [1] "number of DE genes from knock out strategy KO: 847"
## [1] "number of DE genes from knock out strategy PTC: 121"
## [1] "number of common DE genes by knock out strategies CE and KO: 324"
## [1] "number of common DE genes by knock out strategies KO and PTC: 103"
## [1] "number of common DE genes by knock out strategies PTC and CE: 43"
## [1] "hgnc_symbol empty string and NA tables:"
##
## FALSE <NA>
## 18811 4682
##
## FALSE TRUE
## 18811 4682
## [1] "DE group PrS_day_6_nor under KO gene GCM1:"
## [1] "number of DE genes from knock out strategy CE: 5299"
## [1] "number of DE genes from knock out strategy KO: 4486"
## [1] "number of DE genes from knock out strategy PTC: 4255"
## [1] "number of common DE genes by knock out strategies CE and KO: 4013"
## [1] "number of common DE genes by knock out strategies KO and PTC: 3770"
## [1] "number of common DE genes by knock out strategies PTC and CE: 3823"
## [1] "hgnc_symbol empty string and NA tables:"
##
## FALSE <NA>
## 18811 4682
##
## FALSE TRUE
## 18811 4682
## [1] "DE group PrS_day_6_nor under KO gene GRHL1:"
## [1] "number of DE genes from knock out strategy CE: 3439"
## [1] "number of DE genes from knock out strategy KO: 2483"
## [1] "number of DE genes from knock out strategy PTC: 5095"
## [1] "number of common DE genes by knock out strategies CE and KO: 1980"
## [1] "number of common DE genes by knock out strategies KO and PTC: 2044"
## [1] "number of common DE genes by knock out strategies PTC and CE: 3057"
## [1] "hgnc_symbol empty string and NA tables:"
##
## FALSE <NA>
## 18811 4682
##
## FALSE TRUE
## 18811 4682
## [1] "DE group PrS_day_6_nor under KO gene POU2F3:"
## [1] "number of DE genes from knock out strategy CE: 2038"
## [1] "number of DE genes from knock out strategy KO: 590"
## [1] "number of DE genes from knock out strategy PTC: 6"
## [1] "number of common DE genes by knock out strategies CE and KO: 462"
## [1] "number of common DE genes by knock out strategies KO and PTC: 4"
## [1] "number of common DE genes by knock out strategies PTC and CE: 3"
## [1] "hgnc_symbol empty string and NA tables:"
##
## FALSE <NA>
## 18811 4682
##
## FALSE TRUE
## 18811 4682
## [1] "DE group PrS_day_6_nor under KO gene PPARG:"
## [1] "number of DE genes from knock out strategy CE: 4503"
## [1] "number of DE genes from knock out strategy KO: 3029"
## [1] "number of DE genes from knock out strategy PTC: 4014"
## [1] "number of common DE genes by knock out strategies CE and KO: 2690"
## [1] "number of common DE genes by knock out strategies KO and PTC: 2334"
## [1] "number of common DE genes by knock out strategies PTC and CE: 3173"
## [1] "hgnc_symbol empty string and NA tables:"
##
## FALSE <NA>
## 18811 4682
##
## FALSE TRUE
## 18811 4682
Save the size dataframe out
colnames(df_size) = c("DE_group", "knockout_gene_strategy", "n_ko", "n_WT")
df_size = as.data.frame(df_size)
multi_rows = which(table(df_size$knockout_gene_strategy)>1)
multi_rows## EPAS1_CE EPAS1_KO EPAS1_PTC
## 1 2 3
df_size[which(df_size$knockout_gene_strategy%in%names(multi_rows)),]## DE_group knockout_gene_strategy n_ko n_WT
## 4 PrS_day_6_hyp EPAS1_CE 3 3
## 5 PrS_day_6_hyp EPAS1_KO 3 3
## 6 PrS_day_6_hyp EPAS1_PTC 3 3
## 7 PrS_day_6_nor EPAS1_CE 3 12
## 8 PrS_day_6_nor EPAS1_KO 3 12
## 9 PrS_day_6_nor EPAS1_PTC 3 12
fwrite(df_size,
sprintf("%s/%s_DE_n_samples.tsv", output_dir, release_name),
sep="\t")This is only relevant for model system PrS.
Within model system PrS, for WT samples, only use the WT samples for EPAS1 gene for this analysis.
There are 24 samples involved. 6 WT and 6 EPAS1 knockout samples for each knockout strategy. Half of them are under normoxia and the other half are under hypoxia. All the 24 samples are from run_id “20230918_23-robson-008”.
For the 6 WT samples, we do not know whether they are from 3 single cell clones, with each clone having two groups of cells developed under different conditions. For now, run regular DESeq2 on them.
For the 6 knockout samples under each knockout strategy, they are from three single cell clones, with each clone having two groups of cells developed under different conditions. Run paired DESeq2 on them.
df_size = NULL
meta$ko_group = paste(meta$ko_gene, meta$ko_strategy, sep="_")
relevant_ko_groups = unique(meta$ko_group[which(meta$condition=="hyp")])
relevant_ko_groups = sort(relevant_ko_groups)
relevant_samples = meta$label[which(meta$run_id=="20230918_23-robson-008")]
relevant_samples## [1] "PrS-MOK20012C-G07-nor" "PrS-MOK20012-WT1-hyp" "PrS-MOK20012P-A09-nor"
## [4] "PrS-MOK20012W-B02-hyp" "PrS-MOK20012-WT1-nor" "PrS-MOK20012P-D10-nor"
## [7] "PrS-MOK20012C-E06-nor" "PrS-MOK20012P-G09-hyp" "PrS-MOK20012-WT2-nor"
## [10] "PrS-MOK20012C-E06-hyp" "PrS-MOK20012-WT3-hyp" "PrS-MOK20012C-H06-hyp"
## [13] "PrS-MOK20012-WT2-hyp" "PrS-MOK20012W-B01-nor" "PrS-MOK20012C-G07-hyp"
## [16] "PrS-MOK20012W-A03-nor" "PrS-MOK20012W-B02-nor" "PrS-MOK20012W-B01-hyp"
## [19] "PrS-MOK20012-WT3-nor" "PrS-MOK20012C-H06-nor" "PrS-MOK20012W-A03-hyp"
## [22] "PrS-MOK20012P-G09-nor" "PrS-MOK20012P-A09-hyp" "PrS-MOK20012P-D10-hyp"
for (cur_ko_group in relevant_ko_groups){
# note that gene_group is for indicating which knockout gene that WT samples are for
# for knockout samples, its value is the knockout gene
sample2kp = which((meta$label%in%relevant_samples)&
(meta$ko_group == cur_ko_group))
cts_dat_cond = cts_dat[,sample2kp]
meta_cond = meta[sample2kp,]
dim(cts_dat_cond)
dim(meta_cond)
unique_model_day = unique(paste0(meta_cond$model_system, "_day_",
meta_cond$timepoint))
print("")
print("==========================================")
print(paste0(unique_model_day, ", ", cur_ko_group, " samples"))
print("hyp vs nor DE results:")
print("==========================================")
print("")
stopifnot(all(meta_cond$file_id==colnames(cts_dat_cond)))
table(meta_cond$file_id==colnames(cts_dat_cond), useNA="ifany")
stopifnot(all(meta_cond$ko_group==cur_ko_group))
print(table(meta_cond$condition, meta_cond$run_id_short))
cols2kp = c("model_system", "timepoint", "clonal.label", "condition")
sample_info_cond = meta_cond[,cols2kp]
colnames(sample_info_cond)[which(colnames(sample_info_cond)=="clonal.label")] = "clonal_label"
dim(sample_info_cond)
sample_info_cond[1:2,]
# do two steps of filtering
# first, filter based on q75 of each gene
# second, filter based on whether each gene is expressed in at least 4 cells
cond_q75 = apply(cts_dat_cond, 1, quantile, probs=0.75)
cts_dat_cond = cts_dat_cond[cond_q75 > 0,]
print(sprintf("After filtering by gene expression q75, the number of genes reduces from %d to %d",
length(cond_q75), nrow(cts_dat_cond)))
n_pos = rowSums(cts_dat_cond>0)
cts_dat_cond = cts_dat_cond[n_pos >= 4,]
if (length(n_pos)==nrow(cts_dat_cond)){
print("After filtering by nonzero expression in at least 4 samples, the number of genes does not change.")
}else{
print(sprintf("After filtering by nonzero expression in at least 4 samples, the number of genes reduces from %d to %d",
length(n_pos), nrow(cts_dat_cond)))
}
# update the q75 based on only genes that will be used in the DE analysis
sample_info_cond$q75 = apply(cts_dat_cond, 2, quantile, probs = 0.75)
sample_info_cond$log_q75 = log(sample_info_cond$q75)
if (cur_ko_group=="WT_WT"){
dds = DESeqDataSetFromMatrix(cts_dat_cond, colData=sample_info_cond,
~ condition + log_q75)
}else{
dds = DESeqDataSetFromMatrix(cts_dat_cond, colData=sample_info_cond,
~ clonal_label + log_q75 + condition)
}
start.time <- Sys.time()
dds = DESeq(dds)
end.time <- Sys.time()
time.taken <- end.time - start.time
print(time.taken)
## association with read-depth
res_rd = results(dds, name = "log_q75")
res_rd = as.data.frame(res_rd)
dim(res_rd)
res_rd[1:2,]
table(is.na(res_rd$padj))
g0 = ggplot(res_rd %>% subset(!is.na(padj)), aes(x=pvalue)) +
geom_histogram(color="darkblue", fill="lightblue",
breaks = seq(0,1,by=0.01)) +
ggtitle(paste0(unique_model_day, " ", cur_ko_group, " hyp vs nor\nread depth"))
print(g0)
## Rerun the analysis without read-depth if it is not significant for
## most genes, and the number of samples is small
## i.e., proportion of non-null in the distribution is smaller than 0.01
## (this needs to be restricted to the genes with not NA adjusted pvalue)
## or if smaller than 0.1 and the number of samples is not greater than 6
pi_1_rd = max(0, mean(res_rd$pvalue[!is.na(res_rd$padj)] < 0.05) - 0.05)
pi_1_rd = max(pi_1_rd, 0, 1 - 2*mean(res_rd$pvalue[!is.na(res_rd$padj)] > 0.5))
print(sprintf("prop of non-null for rd: %.2e", pi_1_rd))
if(pi_1_rd < 0.01 || (ncol(cts_dat_cond) <= 6 && pi_1_rd < 0.1 )){
print("rerun DESeq2 without read depth")
if (cur_ko_group=="WT_WT"){
dds = DESeqDataSetFromMatrix(cts_dat_cond, colData=sample_info_cond,
~ condition)
}else{
dds = DESeqDataSetFromMatrix(cts_dat_cond, colData=sample_info_cond,
~ clonal_label + condition)
}
dds = DESeq(dds)
}
## DE analysis between two conditions
res_g1 = results(dds, contrast = c("condition", "hyp", "nor"))
res_g1 = signif(data.frame(res_g1), 3)
# add a column to record the number of zeros per test
n_zero = rowSums(cts_dat_cond==0)
res_g1$n_zeros = n_zero
# record the number of samples involved in the comparison
df_size = rbind(df_size,
c(unique_model_day,
cur_ko_group,
sum(sample_info_cond$condition=="hyp"),
sum(sample_info_cond$condition=="nor")))
# prepare a processed version of output
res_g1_reduced = res_g1[, c("log2FoldChange", "pvalue", "padj", "n_zeros")]
res_g1_reduced$padj[which(res_g1_reduced$n_zeros>=4)] = NA
res_g1_reduced = res_g1_reduced[, c("log2FoldChange", "pvalue", "padj")]
n_DE = sum(res_g1_reduced$padj < 0.05 & abs(res_g1_reduced$log2FoldChange) > log2(1.5),
na.rm = TRUE)
print(paste0(unique_model_day, " between nor and hyp conditions, ", cur_ko_group, ":"))
print(sprintf("number of DE genes: %d", n_DE))
g1 = ggplot(res_g1_reduced %>% subset(!is.na(padj)), aes(x=pvalue)) +
geom_histogram(color="darkblue", fill="lightblue",
breaks = seq(0,1,by=0.01)) +
ggtitle(paste0(unique_model_day, " ", cur_ko_group, "\nhyp vs nor"))
print(g1)
tag1 = "hyp_vs_nor_"
colnames(res_g1) = paste0(tag1, colnames(res_g1))
colnames(res_g1_reduced) = paste0(tag1, colnames(res_g1_reduced))
res_df = data.frame(gene_ID=rownames(res_g1),
res_g1)
dim(res_df)
res_df[1:2,]
res_reduced_gene_anno = gene_anno[match(rownames(res_g1_reduced),
gene_anno$ensembl_gene_id), ]
stopifnot(all(rownames(res_g1_reduced)==res_reduced_gene_anno$ensembl_gene_id))
# set gene hgnc_symbol that equal "" to NA
hgnc_symbol_vec = res_reduced_gene_anno$hgnc_symbol
hgnc_symbol_vec[which(hgnc_symbol_vec=="")] = NA
res_reduced_df = data.frame(gene_ID=rownames(res_g1_reduced),
hgnc_symbol=hgnc_symbol_vec,
chr=res_reduced_gene_anno$chr,
strand=res_reduced_gene_anno$strand,
start=res_reduced_gene_anno$start,
end=res_reduced_gene_anno$end,
res_g1_reduced)
print("hgnc_symbol empty string and NA tables:")
print(table(res_reduced_df$hgnc_symbol=="", useNA="ifany"))
print(table(is.na(res_reduced_df$hgnc_symbol)))
dim(res_reduced_df)
res_reduced_df[1:2,]
fwrite(res_df,
sprintf("%s/%s_%s_%s_DEseq2_raw.tsv",
raw_output_dir, release_name,
paste0(unique_model_day, "_", cur_ko_group), "hyp_vs_nor"),
sep="\t")
fwrite(res_reduced_df,
sprintf("%s/%s_%s_%s_DEseq2.tsv",
processed_output_dir, release_name,
paste0(unique_model_day, "_", cur_ko_group), "hyp_vs_nor"),
sep="\t")
}## [1] ""
## [1] "=========================================="
## [1] "PrS_day_6, EPAS1_CE samples"
## [1] "hyp vs nor DE results:"
## [1] "=========================================="
## [1] ""
##
## robson-008
## hyp 3
## nor 3
## [1] "After filtering by gene expression q75, the number of genes reduces from 25835 to 23378"
## [1] "After filtering by nonzero expression in at least 4 samples, the number of genes reduces from 23378 to 20937"
## Time difference of 29.57336 secs
## [1] "prop of non-null for rd: 4.29e-01"
## [1] "PrS_day_6 between nor and hyp conditions, EPAS1_CE:"
## [1] "number of DE genes: 3518"
## [1] "hgnc_symbol empty string and NA tables:"
##
## FALSE <NA>
## 17474 3463
##
## FALSE TRUE
## 17474 3463
## [1] ""
## [1] "=========================================="
## [1] "PrS_day_6, EPAS1_KO samples"
## [1] "hyp vs nor DE results:"
## [1] "=========================================="
## [1] ""
##
## robson-008
## hyp 3
## nor 3
## [1] "After filtering by gene expression q75, the number of genes reduces from 25835 to 23542"
## [1] "After filtering by nonzero expression in at least 4 samples, the number of genes reduces from 23542 to 21177"
## Time difference of 1.055004 mins
## [1] "prop of non-null for rd: 0.00e+00"
## [1] "rerun DESeq2 without read depth"
## [1] "PrS_day_6 between nor and hyp conditions, EPAS1_KO:"
## [1] "number of DE genes: 2518"
## [1] "hgnc_symbol empty string and NA tables:"
##
## FALSE <NA>
## 17534 3643
##
## FALSE TRUE
## 17534 3643
## [1] ""
## [1] "=========================================="
## [1] "PrS_day_6, EPAS1_PTC samples"
## [1] "hyp vs nor DE results:"
## [1] "=========================================="
## [1] ""
##
## robson-008
## hyp 3
## nor 3
## [1] "After filtering by gene expression q75, the number of genes reduces from 25835 to 23382"
## [1] "After filtering by nonzero expression in at least 4 samples, the number of genes reduces from 23382 to 21031"
## Time difference of 54.79509 secs
## [1] "prop of non-null for rd: 0.00e+00"
## [1] "rerun DESeq2 without read depth"
## [1] "PrS_day_6 between nor and hyp conditions, EPAS1_PTC:"
## [1] "number of DE genes: 3664"
## [1] "hgnc_symbol empty string and NA tables:"
##
## FALSE <NA>
## 17464 3567
##
## FALSE TRUE
## 17464 3567
## [1] ""
## [1] "=========================================="
## [1] "PrS_day_6, WT_WT samples"
## [1] "hyp vs nor DE results:"
## [1] "=========================================="
## [1] ""
##
## robson-008
## hyp 3
## nor 3
## [1] "After filtering by gene expression q75, the number of genes reduces from 25835 to 23704"
## [1] "After filtering by nonzero expression in at least 4 samples, the number of genes reduces from 23704 to 21221"
## Time difference of 9.070153 secs
## [1] "prop of non-null for rd: 0.00e+00"
## [1] "rerun DESeq2 without read depth"
## [1] "PrS_day_6 between nor and hyp conditions, WT_WT:"
## [1] "number of DE genes: 6724"
## [1] "hgnc_symbol empty string and NA tables:"
##
## FALSE <NA>
## 17531 3690
##
## FALSE TRUE
## 17531 3690
colnames(df_size) = c("model_system_day", "knockout_gene_strategy", "n_hyp", "n_nor")
fwrite(df_size,
sprintf("%s/%s_DE_hyp_vs_nor_n_samples.tsv", output_dir, release_name),
sep="\t")gc()## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 8143434 435.0 12622100 674.1 NA 12622100 674.1
## Vcells 34191096 260.9 96789315 738.5 65536 96789315 738.5
sessionInfo()## R version 4.2.3 (2023-03-15)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur ... 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] grid stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] readxl_1.4.3 UpSetR_1.4.0
## [3] ComplexHeatmap_2.14.0 data.table_1.15.4
## [5] dplyr_1.1.4 ggpointdensity_0.1.0
## [7] viridis_0.6.5 viridisLite_0.4.2
## [9] DESeq2_1.38.3 SummarizedExperiment_1.28.0
## [11] Biobase_2.58.0 MatrixGenerics_1.10.0
## [13] matrixStats_1.3.0 GenomicRanges_1.50.2
## [15] GenomeInfoDb_1.34.9 IRanges_2.32.0
## [17] S4Vectors_0.36.2 BiocGenerics_0.44.0
## [19] RColorBrewer_1.1-3 MASS_7.3-60
## [21] stringr_1.5.1 ggpubr_0.6.0
## [23] ggrepel_0.9.5 ggplot2_3.5.1
##
## loaded via a namespace (and not attached):
## [1] bitops_1.0-8 bit64_4.0.5 doParallel_1.0.17
## [4] httr_1.4.7 tools_4.2.3 backports_1.5.0
## [7] bslib_0.8.0 utf8_1.2.4 R6_2.5.1
## [10] DBI_1.2.3 colorspace_2.1-1 GetoptLong_1.0.5
## [13] withr_3.0.1 tidyselect_1.2.1 gridExtra_2.3
## [16] bit_4.0.5 compiler_4.2.3 cli_3.6.3
## [19] Cairo_1.6-2 DelayedArray_0.24.0 labeling_0.4.3
## [22] sass_0.4.9 scales_1.3.0 digest_0.6.37
## [25] rmarkdown_2.28 XVector_0.38.0 pkgconfig_2.0.3
## [28] htmltools_0.5.8.1 highr_0.11 fastmap_1.2.0
## [31] GlobalOptions_0.1.2 rlang_1.1.4 rstudioapi_0.16.0
## [34] RSQLite_2.3.7 farver_2.1.2 shape_1.4.6.1
## [37] jquerylib_0.1.4 generics_0.1.3 jsonlite_1.8.8
## [40] BiocParallel_1.32.6 car_3.1-2 RCurl_1.98-1.16
## [43] magrittr_2.0.3 GenomeInfoDbData_1.2.9 Matrix_1.5-4.1
## [46] Rcpp_1.0.13 munsell_0.5.1 fansi_1.0.6
## [49] abind_1.4-5 lifecycle_1.0.4 stringi_1.8.4
## [52] yaml_2.3.10 carData_3.0-5 zlibbioc_1.44.0
## [55] plyr_1.8.9 blob_1.2.4 parallel_4.2.3
## [58] crayon_1.5.3 lattice_0.22-6 Biostrings_2.66.0
## [61] annotate_1.76.0 circlize_0.4.16 KEGGREST_1.38.0
## [64] locfit_1.5-9.10 knitr_1.48 pillar_1.9.0
## [67] rjson_0.2.21 ggsignif_0.6.4 geneplotter_1.76.0
## [70] codetools_0.2-20 XML_3.99-0.17 glue_1.7.0
## [73] evaluate_0.24.0 foreach_1.5.2 vctrs_0.6.5
## [76] png_0.1-8 cellranger_1.1.0 gtable_0.3.5
## [79] purrr_1.0.2 tidyr_1.3.1 clue_0.3-65
## [82] cachem_1.1.0 xfun_0.47 xtable_1.8-4
## [85] broom_1.0.6 rstatix_0.7.2 tibble_3.2.1
## [88] iterators_1.0.14 AnnotationDbi_1.60.2 memoise_2.0.1
## [91] cluster_2.1.6